home *** CD-ROM | disk | FTP | other *** search
/ EuroCD 3 / EuroCD 3.iso / Programming / vbcc / machines / amiga68k / libsrc / math / math_040 / exp.s < prev    next >
Encoding:
Text File  |  1998-06-24  |  39.2 KB  |  758 lines

  1. *
  2. *   $VER: exp.s 34.1 (19.1.97)
  3. *
  4. *   calculates the result of e to the power of x.
  5. *
  6. *   Version history:
  7. *
  8. *   33.1    19.1.97 laukkanen
  9. *
  10. *           - created (using MacLaurin series approximation with five iterations)
  11. *
  12. *   34.1    19.1.97 (c) Motorola
  13. *
  14. *           - unfortunately the result was only close to the current in the immediate
  15. *           - positive vicinity of zero, scrapped.
  16. *           - cut'n pasted from Motorola M68060SP
  17. *
  18. *   34.2    22.1.97 laukkanen
  19. *
  20. *           - trashed a6, fixed
  21. *
  22. *   34.3    23.1.97 (c) Motorola
  23. *
  24. *           - added expm1()
  25. *
  26. *   34.4    24.1.97 laukkanen
  27. *
  28. *           - in the converting process, I seem to have broken
  29. *             expm1(), fixed.
  30. *
  31. *   34.4    24.1.97 laukkanen
  32. *
  33. *           - exmp1() fixed again.
  34. *
  35.  
  36.     machine 68040
  37.     fpu     1
  38.  
  39.     XDEF    _exp
  40.     XDEF    @exp
  41.     XDEF    _expm1
  42.     XDEF    @expm1
  43.  
  44. * ALGORITHM and IMPLEMENTATION **************************************** *
  45. *                                                                       *
  46. *       exp()                                                           *
  47. *       -----                                                           *
  48. *                                                                       *
  49. *       Step 1. Filter out extreme cases of input argument.             *
  50. *               1.1     If |X| >= 2^(-65), go to Step 1.3.              *
  51. *               1.2     Go to Step 7.                                   *
  52. *               1.3     If |X| < 16380 log(2), go to Step 2.            *
  53. *               1.4     Go to Step 8.                                   *
  54. *       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.*
  55. *               To avoid the use of floating-point comparisons, a       *
  56. *               compact representation of |X| is used. This format is a *
  57. *               32-bit integer, the upper (more significant) 16 bits    *
  58. *               are the sign and biased exponent field of |X|; the      *
  59. *               lower 16 bits are the 16 most significant fraction      *
  60. *               (including the explicit bit) bits of |X|. Consequently, *
  61. *               the comparisons in Steps 1.1 and 1.3 can be performed   *
  62. *               by integer comparison. Note also that the constant      *
  63. *               16380 log(2) used in Step 1.3 is also in the compact    *
  64. *               form. Thus taking the branch to Step 2 guarantees       *
  65. *               |X| < 16380 log(2). There is no harm to have a small    *
  66. *               number of cases where |X| is less than, but close to,   *
  67. *               16380 log(2) and the branch to Step 9 is taken.         *
  68. *                                                                       *
  69. *       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).      *
  70. *               2.1     Set AdjFlag := 0 (indicates the branch 1.3 -> 2 *
  71. *                       was taken)                                      *
  72. *               2.2     N := round-to-nearest-integer( X * 64/log2 ).   *
  73. *               2.3     Calculate       J = N mod 64; so J = 0,1,2,..., *
  74. *                       or 63.                                          *
  75. *               2.4     Calculate       M = (N - J)/64; so N = 64M + J. *
  76. *               2.5     Calculate the address of the stored value of    *
  77. *                       2^(J/64).                                       *
  78. *               2.6     Create the value Scale = 2^M.                   *
  79. *       Notes:  The calculation in 2.2 is really performed by           *
  80. *                       Z := X * constant                               *
  81. *                       N := round-to-nearest-integer(Z)                *
  82. *               where                                                   *
  83. *                       constant := single-precision( 64/log 2 ).       *
  84. *                                                                       *
  85. *               Using a single-precision constant avoids memory         *
  86. *               access. Another effect of using a single-precision      *
  87. *               "constant" is that the calculated value Z is            *
  88. *                                                                       *
  89. *                       Z = X*(64/log2)*(1+eps), |eps| <= 2^(-24).      *
  90. *                                                                       *
  91. *               This error has to be considered later in Steps 3 and 4. *
  92. *                                                                       *
  93. *       Step 3. Calculate X - N*log2/64.                                *
  94. *               3.1     R := X + N*L1,                                  *
  95. *                               where L1 := single-precision(-log2/64). *
  96. *               3.2     R := R + N*L2,                                  *
  97. *                               L2 := extended-precision(-log2/64 - L1).*
  98. *       Notes:  a) The way L1 and L2 are chosen ensures L1+L2           *
  99. *               approximate the value -log2/64 to 88 bits of accuracy.  *
  100. *               b) N*L1 is exact because N is no dc.ler than 22 bits    *
  101. *               and L1 is no dc.ler than 24 bits.                       *
  102. *               c) The calculation X+N*L1 is also exact due to          *
  103. *               cancellation. Thus, R is practically X+N(L1+L2) to full *
  104. *               64 bits.                                                *
  105. *               d) It is important to estimate how large can |R| be     *
  106. *               after Step 3.2.                                         *
  107. *                                                                       *
  108. *               N = rnd-to-int( X*64/log2 (1+eps) ), |eps|<=2^(-24)     *
  109. *               X*64/log2 (1+eps)       =       N + f,  |f| <= 0.5      *
  110. *               X*64/log2 - N   =       f - eps*X 64/log2               *
  111. *               X - N*log2/64   =       f*log2/64 - eps*X               *
  112. *                                                                       *
  113. *                                                                       *
  114. *               Now |X| <= 16446 log2, thus                             *
  115. *                                                                       *
  116. *                       |X - N*log2/64| <= (0.5 + 16446/2^(18))*log2/64 *
  117. *                                       <= 0.57 log2/64.                *
  118. *                This bound will be used in Step 4.                     *
  119. *                                                                       *
  120. *       Step 4. Approximate exp(R)-1 by a polynomial                    *
  121. *               p = R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))      *
  122. *       Notes:  a) In order to reduce memory access, the coefficients   *
  123. *               are made as "short" as possible: A1 (which is 1/2), A4  *
  124. *               and A5 are single precision; A2 and A3 are double       *
  125. *               precision.                                              *
  126. *               b) Even with the restrictions above,                    *
  127. *                  |p - (exp(R)-1)| < 2^(-68.8) for all |R| <= 0.0062.  *
  128. *               Note that 0.0062 is slightly bigger than 0.57 log2/64.  *
  129. *               c) To fully utilize the pipeline, p is separated into   *
  130. *               two independent pieces of roughly equal complexities    *
  131. *                       p = [ R + R*S*(A2 + S*A4) ]     +               *
  132. *                               [ S*(A1 + S*(A3 + S*A5)) ]              *
  133. *               where S = R*R.                                          *
  134. *                                                                       *
  135. *       Step 5. Compute 2^(J/64)*exp(R) = 2^(J/64)*(1+p) by             *
  136. *                               ans := T + ( T*p + t)                   *
  137. *               where T and t are the stored values for 2^(J/64).       *
  138. *       Notes:  2^(J/64) is stored as T and t where T+t approximates    *
  139. *               2^(J/64) to roughly 85 bits; T is in extended precision *
  140. *               and t is in single precision. Note also that T is       *
  141. *               rounded to 62 bits so that the last two bits of T are   *
  142. *               zero. The reason for such a special form is that T-1,   *
  143. *               T-2, and T-8 will all be exact --- a property that will *
  144. *               give much more accurate computation of the function     *
  145. *               EXPM1.                                                  *
  146. *                                                                       *
  147. *       Step 6. Reconstruction of exp(X)                                *
  148. *                       exp(X) = 2^M * 2^(J/64) * exp(R).               *
  149. *               6.1     If AdjFlag = 0, go to 6.3                       *
  150. *               6.2     ans := ans * AdjScale                           *
  151. *               6.3     Restore the user FPCR                           *
  152. *               6.4     Return ans := ans * Scale. Exit.                *
  153. *       Notes:  If AdjFlag = 0, we have X = Mlog2 + Jlog2/64 + R,       *
  154. *               |M| <= 16380, and Scale = 2^M. Moreover, exp(X) will    *
  155. *               neither overflow nor underflow. If AdjFlag = 1, that    *
  156. *               means that                                              *
  157. *                       X = (M1+M)log2 + Jlog2/64 + R, |M1+M| >= 16380. *
  158. *               Hence, exp(X) may overflow or underflow or neither.     *
  159. *               When that is the case, AdjScale = 2^(M1) where M1 is    *
  160. *               approximately M. Thus 6.2 will never cause              *
  161. *               over/underflow. Possible exception in 6.4 is overflow   *
  162. *               or underflow. The inexact exception is not generated in *
  163. *               6.4. Although one can argue that the inexact flag       *
  164. *               should always be raised, to simulate that exception     *
  165. *               cost to much than the flag is worth in practical uses.  *
  166. *                                                                       *
  167. *       Step 7. Return 1 + X.                                           *
  168. *               7.1     ans := X                                        *
  169. *               7.2     Restore user FPCR.                              *
  170. *               7.3     Return ans := 1 + ans. Exit                     *
  171. *       Notes:  For non-zero X, the inexact exception will always be    *
  172. *               raised by 7.3. That is the only exception raised by 7.3.*
  173. *               Note also that we use the FMOVEM instruction to move X  *
  174. *               in Step 7.1 to avoid unnecessary trapping. (Although    *
  175. *               the FMOVEM may not seem relevant since X is normalized, *
  176. *               the precaution will be useful in the library version of *
  177. *               this code where the separate entry for denormalized     *
  178. *               inputs will be done away with.)                         *
  179. *                                                                       *
  180. *       Step 8. Handle exp(X) where |X| >= 16380log2.                   *
  181. *               8.1     If |X| > 16480 log2, go to Step 9.              *
  182. *               (mimic 2.2 - 2.6)                                       *
  183. *               8.2     N := round-to-integer( X * 64/log2 )            *
  184. *               8.3     Calculate J = N mod 64, J = 0,1,...,63          *
  185. *               8.4     K := (N-J)/64, M1 := truncate(K/2), M = K-M1,   *
  186. *                       AdjFlag := 1.                                   *
  187. *               8.5     Calculate the address of the stored value       *
  188. *                       2^(J/64).                                       *
  189. *               8.6     Create the values Scale = 2^M, AdjScale = 2^M1. *
  190. *               8.7     Go to Step 3.                                   *
  191. *       Notes:  Refer to notes for 2.2 - 2.6.                           *
  192. *                                                                       *
  193. *       Step 9. Handle exp(X), |X| > 16480 log2.                        *
  194. *               9.1     If X < 0, go to 9.3                             *
  195. *               9.2     ans := Huge, go to 9.4                          *
  196. *               9.3     ans := Tiny.                                    *
  197. *               9.4     Restore user FPCR.                              *
  198. *               9.5     Return ans := ans * ans. Exit.                  *
  199. *       Notes:  Exp(X) will surely overflow or underflow, depending on  *
  200. *               X's sign. "Huge" and "Tiny" are respectively large/tiny *
  201. *               extended-precision numbers whose square over/underflow  *
  202. *               with an inexact result. Thus, 9.5 always raises the     *
  203. *               inexact together with either overflow or underflow.     *
  204. *************************************************************************
  205.  
  206.         cnop            0,4
  207.  
  208. L2      dc.l            $3FDC0000,$82E30865,$4361C4C6,$00000000
  209.  
  210. EEXPA3  dc.l            $3FA55555,$55554CC1
  211. EEXPA2  dc.l            $3FC55555,$55554A54
  212.  
  213. EM1A4   dc.l            $3F811111,$11174385
  214. EM1A3   dc.l            $3FA55555,$55554F5A
  215.  
  216. EM1A2   dc.l            $3FC55555,$55555555,$00000000,$00000000
  217.  
  218. EM1B8   dc.l            $3EC71DE3,$A5774682
  219. EM1B7   dc.l            $3EFA01A0,$19D7CB68
  220.  
  221. EM1B6   dc.l            $3F2A01A0,$1A019DF3
  222. EM1B5   dc.l            $3F56C16C,$16C170E2
  223.  
  224. EM1B4   dc.l            $3F811111,$11111111
  225. EM1B3   dc.l            $3FA55555,$55555555
  226.  
  227. EM1B2   dc.l            $3FFC0000,$AAAAAAAA,$AAAAAAAB
  228.         dc.l            $00000000
  229.  
  230. TWO140  dc.l            $48B00000,$00000000
  231. TWON140
  232.         dc.l            $37300000,$00000000
  233.  
  234. EEXPTBL
  235.         dc.l            $3FFF0000,$80000000,$00000000,$00000000
  236.         dc.l            $3FFF0000,$8164D1F3,$BC030774,$9F841A9B
  237.         dc.l            $3FFF0000,$82CD8698,$AC2BA1D8,$9FC1D5B9
  238.         dc.l            $3FFF0000,$843A28C3,$ACDE4048,$A0728369
  239.         dc.l            $3FFF0000,$85AAC367,$CC487B14,$1FC5C95C
  240.         dc.l            $3FFF0000,$871F6196,$9E8D1010,$1EE85C9F
  241.         dc.l            $3FFF0000,$88980E80,$92DA8528,$9FA20729
  242.         dc.l            $3FFF0000,$8A14D575,$496EFD9C,$A07BF9AF
  243.         dc.l            $3FFF0000,$8B95C1E3,$EA8BD6E8,$A0020DCF
  244.         dc.l            $3FFF0000,$8D1ADF5B,$7E5BA9E4,$205A63DA
  245.         dc.l            $3FFF0000,$8EA4398B,$45CD53C0,$1EB70051
  246.         dc.l            $3FFF0000,$9031DC43,$1466B1DC,$1F6EB029
  247.         dc.l            $3FFF0000,$91C3D373,$AB11C338,$A0781494
  248.         dc.l            $3FFF0000,$935A2B2F,$13E6E92C,$9EB319B0
  249.         dc.l            $3FFF0000,$94F4EFA8,$FEF70960,$2017457D
  250.         dc.l            $3FFF0000,$96942D37,$20185A00,$1F11D537
  251.         dc.l            $3FFF0000,$9837F051,$8DB8A970,$9FB952DD
  252.         dc.l            $3FFF0000,$99E04593,$20B7FA64,$1FE43087
  253.         dc.l            $3FFF0000,$9B8D39B9,$D54E5538,$1FA2A818
  254.         dc.l            $3FFF0000,$9D3ED9A7,$2CFFB750,$1FDE494D
  255.         dc.l            $3FFF0000,$9EF53260,$91A111AC,$20504890
  256.         dc.l            $3FFF0000,$A0B0510F,$B9714FC4,$A073691C
  257.         dc.l            $3FFF0000,$A2704303,$0C496818,$1F9B7A05
  258.         dc.l            $3FFF0000,$A43515AE,$09E680A0,$A0797126
  259.         dc.l            $3FFF0000,$A5FED6A9,$B15138EC,$A071A140
  260.         dc.l            $3FFF0000,$A7CD93B4,$E9653568,$204F62DA
  261.         dc.l            $3FFF0000,$A9A15AB4,$EA7C0EF8,$1F283C4A
  262.         dc.l            $3FFF0000,$AB7A39B5,$A93ED338,$9F9A7FDC
  263.         dc.l            $3FFF0000,$AD583EEA,$42A14AC8,$A05B3FAC
  264.         dc.l            $3FFF0000,$AF3B78AD,$690A4374,$1FDF2610
  265.         dc.l            $3FFF0000,$B123F581,$D2AC2590,$9F705F90
  266.         dc.l            $3FFF0000,$B311C412,$A9112488,$201F678A
  267.         dc.l            $3FFF0000,$B504F333,$F9DE6484,$1F32FB13
  268.         dc.l            $3FFF0000,$B6FD91E3,$28D17790,$20038B30
  269.         dc.l            $3FFF0000,$B8FBAF47,$62FB9EE8,$200DC3CC
  270.         dc.l            $3FFF0000,$BAFF5AB2,$133E45FC,$9F8B2AE6
  271.         dc.l            $3FFF0000,$BD08A39F,$580C36C0,$A02BBF70
  272.         dc.l            $3FFF0000,$BF1799B6,$7A731084,$A00BF518
  273.         dc.l            $3FFF0000,$C12C4CCA,$66709458,$A041DD41
  274.         dc.l            $3FFF0000,$C346CCDA,$24976408,$9FDF137B
  275.         dc.l            $3FFF0000,$C5672A11,$5506DADC,$201F1568
  276.         dc.l            $3FFF0000,$C78D74C8,$ABB9B15C,$1FC13A2E
  277.         dc.l            $3FFF0000,$C9B9BD86,$6E2F27A4,$A03F8F03
  278.         dc.l            $3FFF0000,$CBEC14FE,$F2727C5C,$1FF4907D
  279.         dc.l            $3FFF0000,$CE248C15,$1F8480E4,$9E6E53E4
  280.         dc.l            $3FFF0000,$D06333DA,$EF2B2594,$1FD6D45C
  281.         dc.l            $3FFF0000,$D2A81D91,$F12AE45C,$A076EDB9
  282.         dc.l            $3FFF0000,$D4F35AAB,$CFEDFA20,$9FA6DE21
  283.         dc.l            $3FFF0000,$D744FCCA,$D69D6AF4,$1EE69A2F
  284.         dc.l            $3FFF0000,$D99D15C2,$78AFD7B4,$207F439F
  285.         dc.l            $3FFF0000,$DBFBB797,$DAF23754,$201EC207
  286.         dc.l            $3FFF0000,$DE60F482,$5E0E9124,$9E8BE175
  287.         dc.l            $3FFF0000,$E0CCDEEC,$2A94E110,$20032C4B
  288.         dc.l            $3FFF0000,$E33F8972,$BE8A5A50,$2004DFF5
  289.         dc.l            $3FFF0000,$E5B906E7,$7C8348A8,$1E72F47A
  290.         dc.l            $3FFF0000,$E8396A50,$3C4BDC68,$1F722F22
  291.         dc.l            $3FFF0000,$EAC0C6E7,$DD243930,$A017E945
  292.         dc.l            $3FFF0000,$ED4F301E,$D9942B84,$1F401A5B
  293.         dc.l            $3FFF0000,$EFE4B99B,$DCDAF5CC,$9FB9A9E3
  294.         dc.l            $3FFF0000,$F281773C,$59FFB138,$20744C05
  295.         dc.l            $3FFF0000,$F5257D15,$2486CC2C,$1F773A19
  296.         dc.l            $3FFF0000,$F7D0DF73,$0AD13BB8,$1FFE90D5
  297.         dc.l            $3FFF0000,$FA83B2DB,$722A033C,$A041ED22
  298.         dc.l            $3FFF0000,$FD3E0C0C,$F486C174,$1F853F3A
  299.  
  300.  
  301. SCALE       EQU     -16
  302. ADJSCALE    EQU     -28
  303. SC          EQU     -16
  304. ONEBYSC     EQU     -28
  305. L_SCR1      EQU     -4
  306. TEMP_SIZE   EQU     28
  307.  
  308.  
  309. _exp
  310.         fmove.d         (4,sp),fp0
  311. @exp
  312.  
  313. ;--Step 2.
  314. ;--This is the normal branch:   2^(-65) <= |X| < 16380 log2.
  315.         fmove.s         fp0,-(sp)
  316.         move.w          (sp),d0
  317.         addq.l          #4,sp
  318.         and.w           #$7f80,d0
  319.         beq             .zero
  320.  
  321.         fmove.x         fp0,fp1
  322.         fmul.s          #$42B8AA3B,fp0          ; 64/log2 * X
  323.         link            a0,#-TEMP_SIZE
  324.         fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
  325.         fmove.l         fp0,d1                  ; N = int( X * 64/log2 )
  326.         lea             (EEXPTBL,pc),a1
  327.         fmove.l         d1,fp0                  ; convert to floating-format
  328.  
  329.         move.l          d1,d0                   ; save N temporarily
  330.         and.l           #$3F,d1                 ; D0 is J = N mod 64
  331.         asr.l           #6,d0                   ; D0 is M
  332.         fmove.x         fp0,fp2
  333.         lsl.l           #4,d1
  334.         fmul.s          #$BC317218,fp0          ; N * L1, L1 = lead(-log2/64)
  335.         add.w           #$3FFF,d0               ; biased expo. of 2^(M)
  336.         add.l           d1,a1                   ; address of 2^(J/64)
  337.         move.w          (L2,pc),(L_SCR1,a0)     ; prefetch L2, no need in CB
  338.  
  339. .EXPCONT1
  340. ;--Step 3.
  341. ;--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
  342. ;--a0 points to 2^(J/64), D0 is biased expo. of 2^(M)
  343.         fmul.x          (L2,pc),fp2             ; N * L2, L1+L2 = -log2/64
  344.         fadd.x          fp1,fp0                 ; X + N*L1
  345.         fadd.x          fp2,fp0                 ; fp0 is R, reduced arg.
  346.  
  347. ;--Step 4.
  348. ;--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
  349. ;-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*A5))))
  350. ;--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
  351. ;--[R+R*S*(A2+S*A4)] + [S*(A1+S*(A3+S*A5))]
  352.  
  353.         fmove.x         fp0,fp1
  354.         fmul.x          fp1,fp1                 ; fp1 IS S = R*R
  355.  
  356.         fmove.s         #$3AB60B70,fp2          ; fp2 IS A5
  357.  
  358.         fmul.x          fp1,fp2                 ; fp2 IS S*A5
  359.         fmove.x         fp1,fp3
  360.         fmul.s          #$3C088895,fp3          ; fp3 IS S*A4
  361.  
  362.         fadd.d          (EEXPA3,pc),fp2         ; fp2 IS A3+S*A5
  363.         fadd.d          (EEXPA2,pc),fp3         ; fp3 IS A2+S*A4
  364.  
  365.         fmul.x          fp1,fp2                 ; fp2 IS S*(A3+S*A5)
  366.         move.w          d0,(SCALE,a0)           ; SCALE is 2^(M) in extended
  367.         move.l          #$80000000,(SCALE+4,a0)
  368.         clr.l           (SCALE+8,a0)
  369.  
  370.         fmul.x          fp1,fp3                 ; fp3 IS S*(A2+S*A4)
  371.  
  372.         fadd.s          #$3F000000,fp2          ; fp2 IS A1+S*(A3+S*A5)
  373.         fmul.x          fp0,fp3                 ; fp3 IS R*S*(A2+S*A4)
  374.  
  375.         fmul.x          fp1,fp2                 ; fp2 IS S*(A1+S*(A3+S*A5))
  376.         fadd.x          fp3,fp0                 ; fp0 IS R+R*S*(A2+S*A4),
  377.  
  378.         fmove.x         (a1)+,fp1               ; fp1 is lead. pt. of 2^(J/64)
  379.         fadd.x          fp2,fp0                 ; fp0 is EXP(R) - 1
  380.  
  381. ;--Step 5
  382. ;--final reconstruction process
  383. ;--EXP(X) = 2^M * ( 2^(J/64) + 2^(J/64)*(EXP(R)-1) )
  384.  
  385.         fmul.x          fp1,fp0                 ; 2^(J/64)*(Exp(R)-1)
  386.         fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}
  387.         fadd.s          (a1),fp0                ; accurate 2^(J/64)
  388.  
  389.         fadd.x          fp1,fp0                 ; 2^(J/64) + 2^(J/64)*...
  390.  
  391. .NORMAL
  392.         fmul.x          (SCALE,a0),fp0          ; multiply 2^(M)
  393.         unlk            a0
  394.         rts
  395. .zero
  396.         fmove.s         #1,fp0
  397.         rts
  398.  
  399. *************************************************************************
  400. * expm1():  computes the exponential minus 1 for a normalized input     *
  401. *                                                                       *
  402. * INPUT *************************************************************** *
  403. *       fp0 = extended precision input                                  *
  404. *                                                                       *
  405. * OUTPUT ************************************************************** *
  406. *       fp0 = exp(X) or exp(X)-1                                        *
  407. *                                                                       *
  408. * ACCURACY and MONOTONICITY ******************************************* *
  409. *       The returned result is within 0.85 ulps in 64 significant bit,  *
  410. *       i.e. within 0.5001 ulp to 53 bits if the result is subsequently *
  411. *       rounded to double precision. The result is provably monotonic   *
  412. *       in double precision.                                            *
  413. *                                                                       *
  414. * ALGORITHM and IMPLEMENTATION **************************************** *
  415. *                                                                       *
  416. *       Step 1. Check |X|                                               *
  417. *               1.1     If |X| >= 1/4, go to Step 1.3.                  *
  418. *               1.2     Go to Step 7.                                   *
  419. *               1.3     If |X| < 70 log(2), go to Step 2.               *
  420. *               1.4     Go to Step 10.                                  *
  421. *       Notes:  The usual case should take the branches 1.1 -> 1.3 -> 2.*
  422. *               However, it is conceivable |X| can be small very often  *
  423. *               because EXPM1 is intended to evaluate exp(X)-1          *
  424. *               accurately when |X| is small. For further details on    *
  425. *               the comparisons, see the notes on Step 1 of setox.      *
  426. *                                                                       *
  427. *       Step 2. Calculate N = round-to-nearest-int( X * 64/log2 ).      *
  428. *               2.1     N := round-to-nearest-integer( X * 64/log2 ).   *
  429. *               2.2     Calculate       J = N mod 64; so J = 0,1,2,..., *
  430. *                       or 63.                                          *
  431. *               2.3     Calculate       M = (N - J)/64; so N = 64M + J. *
  432. *               2.4     Calculate the address of the stored value of    *
  433. *                       2^(J/64).                                       *
  434. *               2.5     Create the values Sc = 2^M and                  *
  435. *                       OnebySc := -2^(-M).                             *
  436. *       Notes:  See the notes on Step 2 of setox.                       *
  437. *                                                                       *
  438. *       Step 3. Calculate X - N*log2/64.                                *
  439. *               3.1     R := X + N*L1,                                  *
  440. *                               where L1 := single-precision(-log2/64). *
  441. *               3.2     R := R + N*L2,                                  *
  442. *                               L2 := extended-precision(-log2/64 - L1).*
  443. *       Notes:  Applying the analysis of Step 3 of setox in this case   *
  444. *               shows that |R| <= 0.0055 (note that |X| <= 70 log2 in   *
  445. *               this case).                                             *
  446. *                                                                       *
  447. *       Step 4. Approximate exp(R)-1 by a polynomial                    *
  448. *                       p = R+R*R*(A1+R*(A2+R*(A3+R*(A4+R*(A5+R*A6))))) *
  449. *       Notes:  a) In order to reduce memory access, the coefficients   *
  450. *               are made as "short" as possible: A1 (which is 1/2), A5  *
  451. *               and A6 are single precision; A2, A3 and A4 are double   *
  452. *               precision.                                              *
  453. *               b) Even with the restriction above,                     *
  454. *                       |p - (exp(R)-1)| <      |R| * 2^(-72.7)         *
  455. *               for all |R| <= 0.0055.                                  *
  456. *               c) To fully utilize the pipeline, p is separated into   *
  457. *               two independent pieces of roughly equal complexity      *
  458. *                       p = [ R*S*(A2 + S*(A4 + S*A6)) ]        +       *
  459. *                               [ R + S*(A1 + S*(A3 + S*A5)) ]          *
  460. *               where S = R*R.                                          *
  461. *                                                                       *
  462. *       Step 5. Compute 2^(J/64)*p by                                   *
  463. *                               p := T*p                                *
  464. *               where T and t are the stored values for 2^(J/64).       *
  465. *       Notes:  2^(J/64) is stored as T and t where T+t approximates    *
  466. *               2^(J/64) to roughly 85 bits; T is in extended precision *
  467. *               and t is in single precision. Note also that T is       *
  468. *               rounded to 62 bits so that the last two bits of T are   *
  469. *               zero. The reason for such a special form is that T-1,   *
  470. *               T-2, and T-8 will all be exact --- a property that will *
  471. *               be exploited in Step 6 below. The total relative error  *
  472. *               in p is no bigger than 2^(-67.7) compared to the final  *
  473. *               result.                                                 *
  474. *                                                                       *
  475. *       Step 6. Reconstruction of exp(X)-1                              *
  476. *                       exp(X)-1 = 2^M * ( 2^(J/64) + p - 2^(-M) ).     *
  477. *               6.1     If M <= 63, go to Step 6.3.                     *
  478. *               6.2     ans := T + (p + (t + OnebySc)). Go to 6.6       *
  479. *               6.3     If M >= -3, go to 6.5.                          *
  480. *               6.4     ans := (T + (p + t)) + OnebySc. Go to 6.6       *
  481. *               6.5     ans := (T + OnebySc) + (p + t).                 *
  482. *               6.6     Restore user FPCR.                              *
  483. *               6.7     Return ans := Sc * ans. Exit.                   *
  484. *       Notes:  The various arrangements of the expressions give        *
  485. *               accurate evaluations.                                   *
  486. *                                                                       *
  487. *       Step 7. exp(X)-1 for |X| < 1/4.                                 *
  488. *               7.1     If |X| >= 2^(-65), go to Step 9.                *
  489. *               7.2     Go to Step 8.                                   *
  490. *                                                                       *
  491. *       Step 8. Calculate exp(X)-1, |X| < 2^(-65).                      *
  492. *               8.1     If |X| < 2^(-16312), goto 8.3                   *
  493. *               8.2     Restore FPCR; return ans := X - 2^(-16382).     *
  494. *                       Exit.                                           *
  495. *               8.3     X := X * 2^(140).                               *
  496. *               8.4     Restore FPCR; ans := ans - 2^(-16382).          *
  497. *                Return ans := ans*2^(140). Exit                        *
  498. *       Notes:  The idea is to return "X - tiny" under the user         *
  499. *               precision and rounding modes. To avoid unnecessary      *
  500. *               inefficiency, we stay away from denormalized numbers    *
  501. *               the best we can. For |X| >= 2^(-16312), the             *
  502. *               straightforward 8.2 generates the inexact exception as  *
  503. *               the case warrants.                                      *
  504. *                                                                       *
  505. *       Step 9. Calculate exp(X)-1, |X| < 1/4, by a polynomial          *
  506. *                       p = X + X*X*(B1 + X*(B2 + ... + X*B12))         *
  507. *       Notes:  a) In order to reduce memory access, the coefficients   *
  508. *               are made as "short" as possible: B1 (which is 1/2), B9  *
  509. *               to B12 are single precision; B3 to B8 are double        *
  510. *               precision; and B2 is double extended.                   *
  511. *               b) Even with the restriction above,                     *
  512. *                       |p - (exp(X)-1)| < |X| 2^(-70.6)                *
  513. *               for all |X| <= 0.251.                                   *
  514. *               Note that 0.251 is slightly bigger than 1/4.            *
  515. *               c) To fully preserve accuracy, the polynomial is        *
  516. *               computed as                                             *
  517. *                       X + ( S*B1 +    Q ) where S = X*X and           *
  518. *                       Q       =       X*S*(B2 + X*(B3 + ... + X*B12)) *
  519. *               d) To fully utilize the pipeline, Q is separated into   *
  520. *               two independent pieces of roughly equal complexity      *
  521. *                       Q = [ X*S*(B2 + S*(B4 + ... + S*B12)) ] +       *
  522. *                               [ S*S*(B3 + S*(B5 + ... + S*B11)) ]     *
  523. *                                                                       *
  524. *       Step 10. Calculate exp(X)-1 for |X| >= 70 log 2.                *
  525. *               10.1 If X >= 70log2 , exp(X) - 1 = exp(X) for all       *
  526. *               practical purposes. Therefore, go to Step 1 of setox.   *
  527. *               10.2 If X <= -70log2, exp(X) - 1 = -1 for all practical *
  528. *               purposes.                                               *
  529. *               ans := -1                                               *
  530. *               Restore user FPCR                                       *
  531. *               Return ans := ans + 2^(-126). Exit.                     *
  532. *       Notes:  10.2 will always create an inexact and return -1 + tiny *
  533. *               in the user rounding precision and mode.                *
  534. *                                                                       *
  535. *************************************************************************
  536.  
  537. _expm1
  538.         fmove.d         (4,sp),fp0
  539. @expm1
  540. ;--entry point for EXPM1(X), here X is finite, non-zero, non-NaN
  541.  
  542. ;--Step 1.
  543. ;--Step 1.1
  544.         link            a0,#-TEMP_SIZE
  545.         fmove.x         fp0,(SC,a0)
  546.         move.l          (SC,a0),d1              ; load part of input X
  547.         and.l           #$7FFF0000,d1           ; biased expo. of X
  548.         cmp.l           #$3FFD0000,d1           ; 1/4
  549.         bge.b           .EM1CON1                ; |X| >= 1/4
  550.         bra             .EM1SM
  551.  
  552. .EM1CON1
  553. ;--Step 1.3
  554. ;--The case |X| >= 1/4
  555.         move.w          (SC+4,a0),d1            ; expo. and partial sig. of |X|
  556.         cmp.l           #$4004C215,d1           ; 70log2 rounded up to 16 bits
  557.         ble.b           .EM1MAIN                ; 1/4 <= |X| <= 70log2
  558.         bra             .EM1BIG
  559.  
  560. .EM1MAIN
  561. ;--Step 2.
  562. ;--This is the case:    1/4 <= |X| <= 70 log2.
  563.  
  564.         fmove.x         fp0,fp1
  565.         fmul.s          #$42B8AA3B,fp0          ; 64/log2 * X
  566.         fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
  567.         fmove.l         fp0,d1                  ; N = int( X * 64/log2 )
  568.         lea             (EEXPTBL,pc),a1
  569.         fmove.l         d1,fp0                  ; convert to floating-format
  570.         move.l          d1,(L_SCR1,a0)          ; save N temporarily
  571.         and.l           #$3F,d1                 ; D0 is J = N mod 64
  572.         lsl.l           #4,d1
  573.         add.l           d1,a1                   ; address of 2^(J/64)
  574.         move.l          (L_SCR1,a0),d1
  575.         asr.l           #6,d1                   ; D0 is M
  576.         move.l          d1,(L_SCR1,a0)          ; save a copy of M
  577.  
  578. ;--Step 3.
  579. ;--fp1,fp2 saved on the stack. fp0 is N, fp1 is X,
  580. ;--a0 points to 2^(J/64), D0 and a1 both contain M
  581.         fmove.x         fp0,fp2
  582.         fmul.s          #$BC317218,fp0          ; N * L1, L1 = lead(-log2/64)
  583.         fmul.x          (L2,pc),fp2             ; N * L2, L1+L2 = -log2/64
  584.         fadd.x          fp1,fp0                 ; X + N*L1
  585.         fadd.x          fp2,fp0                 ; fp0 is R, reduced arg.
  586.         add.w           #$3FFF,d1               ; D0 is biased expo. of 2^M
  587.  
  588. ;--Step 4.
  589. ;--WE NOW COMPUTE EXP(R)-1 BY A POLYNOMIAL
  590. ;-- R + R*R*(A1 + R*(A2 + R*(A3 + R*(A4 + R*(A5 + R*A6)))))
  591. ;--TO FULLY UTILIZE THE PIPELINE, WE COMPUTE S = R*R
  592. ;--[R*S*(A2+S*(A4+S*A6))] + [R+S*(A1+S*(A3+S*A5))]
  593.  
  594.         fmove.x         fp0,fp1
  595.         fmul.x          fp1,fp1                 ; fp1 IS S = R*R
  596.  
  597.         fmove.s         #$3950097B,fp2          ; fp2 IS a6
  598.  
  599.         fmul.x          fp1,fp2                 ; fp2 IS S*A6
  600.         fmove.x         fp1,fp3
  601.         fmul.s          #$3AB60B6A,fp3          ; fp3 IS S*A5
  602.  
  603.         fadd.d          (EM1A4,pc),fp2          ; fp2 IS A4+S*A6
  604.         fadd.d          (EM1A3,pc),fp3          ; fp3 IS A3+S*A5
  605.         move.w          d1,(SC,a0)              ; SC is 2^(M) in extended
  606.         move.l          #$80000000,(SC+4,a0)
  607.         clr.l           (SC+8,a0)
  608.  
  609.         fmul.x          fp1,fp2                 ; fp2 IS S*(A4+S*A6)
  610.         move.l          (L_SCR1,a0),d1          ; D0 is M
  611.         neg.w           d1                      ; D0 is -M
  612.         fmul.x          fp1,fp3                 ; fp3 IS S*(A3+S*A5)
  613.         add.w           #$3FFF,d1               ; biased expo. of 2^(-M)
  614.         fadd.d          (EM1A2,pc),fp2          ; fp2 IS A2+S*(A4+S*A6)
  615.         fadd.s          #$3F000000,fp3          ; fp3 IS A1+S*(A3+S*A5)
  616.         fmul.x          fp1,fp2                 ; fp2 IS S*(A2+S*(A4+S*A6))
  617.         or.w            #$8000,d1               ; signed/expo. of -2^(-M)
  618.         move.w          d1,(ONEBYSC,a0)         ; OnebySc is -2^(-M)
  619.         move.l          #$80000000,(ONEBYSC+4,a0)
  620.         clr.l           (ONEBYSC+8,a0)
  621.         fmul.x          fp3,fp1                 ; fp1 IS S*(A1+S*(A3+S*A5))
  622.  
  623.         fmul.x          fp0,fp2                 ; fp2 IS R*S*(A2+S*(A4+S*A6))
  624.         fadd.x          fp1,fp0                 ; fp0 IS R+S*(A1+S*(A3+S*A5))
  625.  
  626.         fadd.x          fp2,fp0                 ; fp0 IS EXP(R)-1
  627.  
  628.         fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}
  629.  
  630. ;--Step 5
  631. ;--Compute 2^(J/64)*p
  632.  
  633.         fmul.x          (a1),fp0                ; 2^(J/64)*(Exp(R)-1)
  634.  
  635. ;--Step 6
  636. ;--Step 6.1
  637.         move.l          (L_SCR1,a0),d1          ; retrieve M
  638.         cmp.l           #63,d1
  639.         ble.b           .MLE63
  640. ;--Step 6.2     M >= 64
  641.         fmove.s         (12,a1),fp1             ; fp1 is t
  642.         fadd.x          (ONEBYSC,a0),fp1        ; fp1 is t+OnebySc
  643.         fadd.x          fp1,fp0                 ; p+(t+OnebySc), fp1 released
  644.         fadd.x          (a1),fp0                ; T+(p+(t+OnebySc))
  645.         bra             .EM1SCALE
  646. .MLE63:
  647. ;--Step 6.3     M <= 63
  648.         cmp.l           #-3,d1
  649.         bge.b           .MGEN3
  650. .MLTN3:
  651. ;--Step 6.4     M <= -4
  652.         fadd.s          (12,a1),fp0             ; p+t
  653.         fadd.x          (a1),fp0                ; T+(p+t)
  654.         fadd.x          (ONEBYSC,a0),fp0        ; OnebySc + (T+(p+t))
  655.         bra             .EM1SCALE
  656. .MGEN3
  657. ;--Step 6.5     -3 <= M <= 63
  658.         fmove.x         (a1)+,fp1               ; fp1 is T
  659.         fadd.s          (a1),fp0                ; fp0 is p+t
  660.         fadd.x          (ONEBYSC,a0),fp1        ; fp1 is T+OnebySc
  661.         fadd.x          fp1,fp0                 ; (T+OnebySc)+(p+t)
  662. .EM1SCALE
  663. ;--Step 6.6
  664.         fmul.x          (SC,a0),fp0
  665.         unlk            a0
  666.         rts
  667.  
  668. .EM1SM
  669. ;--Step 7       |X| < 1/4.
  670.         cmp.l           #$3FBE0000,d1           ; 2^(-65)
  671.         bge.b           .EM1POLY
  672.  
  673. .EM1TINY
  674. ;--Step 8       |X| < 2^(-65)
  675.         cmp.l           #$00330000,d1           ; 2^(-16312)
  676.         blt.b           .EM12TINY
  677. ;--Step 8.2
  678.         move.l          #$80010000,(SC,a0)      ; SC is -2^(-16382)
  679.         move.l          #$80000000,(SC+4,a0)
  680.         clr.l           (SC+8,a0)
  681.  
  682.         fadd.x          (SC,a0),fp0
  683.         unlk            a0
  684.         rts
  685.  
  686. .EM12TINY
  687. ;--Step 8.3
  688.         fmul.d          (TWO140,pc),fp0
  689.         move.l          #$80010000,(SC,a0)
  690.         move.l          #$80000000,(SC+4,a0)
  691.         clr.l           (SC+8,a0)
  692.         fadd.x          (SC,a0),fp0
  693.         fmul.d          (TWON140,pc),fp0
  694.         unlk            a0
  695.         rts
  696.  
  697. .EM1POLY
  698. ;--Step 9       exp(X)-1 by a simple polynomial
  699.         fmul.x          fp0,fp0                 ; fp0 is S := X*X
  700.         fmovem.x        fp2/fp3,-(sp)           ; save fp2 {fp2/fp3}
  701.         fmove.s         #$2F30CAA8,fp1          ; fp1 is B12
  702.         fmul.x          fp0,fp1                 ; fp1 is S*B12
  703.         fmove.s         #$310F8290,fp2          ; fp2 is B11
  704.         fadd.s          #$32D73220,fp1          ; fp1 is B10+S*B12
  705.         fmul.x          fp0,fp2                 ; fp2 is S*B11
  706.         fmul.x          fp0,fp1                 ; fp1 is S*(B10 + ...
  707.  
  708.         fadd.s          #$3493F281,fp2          ; fp2 is B9+S*...
  709.         fadd.d          (EM1B8,pc),fp1          ; fp1 is B8+S*...
  710.  
  711.         fmul.x          fp0,fp2                 ; fp2 is S*(B9+...
  712.         fmul.x          fp0,fp1                 ; fp1 is S*(B8+...
  713.  
  714.         fadd.d          (EM1B7,pc),fp2          ; fp2 is B7+S*...
  715.         fadd.d          (EM1B6,pc),fp1          ; fp1 is B6+S*...
  716.  
  717.         fmul.x          fp0,fp2                 ; fp2 is S*(B7+...
  718.         fmul.x          fp0,fp1                 ; fp1 is S*(B6+...
  719.  
  720.         fadd.d          (EM1B5,pc),fp2          ; fp2 is B5+S*...
  721.         fadd.d          (EM1B4,pc),fp1          ; fp1 is B4+S*...
  722.  
  723.         fmul.x          fp0,fp2                 ; fp2 is S*(B5+...
  724.         fmul.x          fp0,fp1                 ; fp1 is S*(B4+...
  725.  
  726.         fadd.d          (EM1B3,pc),fp2          ; fp2 is B3+S*...
  727.         fadd.x          (EM1B2,pc),fp1          ; fp1 is B2+S*...
  728.  
  729.         fmul.x          fp0,fp2                 ; fp2 is S*(B3+...
  730.         fmul.x          fp0,fp1                 ; fp1 is S*(B2+...
  731.  
  732.         fmul.x          fp0,fp2                 ; fp2 is S*S*(B3+...)
  733.         fmul.x          (SC,a0),fp1             ; fp1 is X*S*(B2...
  734.  
  735.         fmul.s          #$3F000000,fp0          ; fp0 is S*B1
  736.         fadd.x          fp2,fp1                 ; fp1 is Q
  737.  
  738.         fmovem.x        (sp)+,fp2/fp3           ; fp2 restored {fp2/fp3}
  739.  
  740.         fadd.x          fp1,fp0                 ; fp0 is S*B1+Q
  741.  
  742.         fadd.x          (SC,a0),fp0
  743.         unlk            a0
  744.         rts
  745.  
  746. .EM1BIG
  747. ;--Step 10      |X| > 70 log2
  748.         move.l          (SC,a0),d1
  749.         unlk            a0
  750.         tst.l           d1
  751.         bgt.w           @exp
  752.  
  753. ;--Step 10.2
  754.         fmove.s         #$BF800000,fp0          ; fp0 is -1
  755.         fadd.s          #$00800000,fp0          ; -1 + 2^(-126)
  756.         unlk            a0
  757.         rts
  758.